How PGSQL can facilitate the data processing workflow

Parfait Gasana

Data Analyst, Winston & Strawn
## Libraries
library(DBI)
Loading required package: methods
library(RPostgreSQL)
library(microbenchmark)

library(scales)
library(ggplot2)

options(repr.plot.width=15, repr.plot.height=5)
options(scipen=999)

Import CSV Data

csv_import <- function() {
    setwd("~/Documents/PGSQL")
    
    bus_month <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
    bus_rides <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')

    rail_stops <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
    rail_rides <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
}

Import SQL Data

sql_import <- function() {
    conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
                      user="ctadba", password="cta18!", port=5432)

    bus_month <- dbGetQuery(conn, "SELECT * FROM bus_month")
    bus_rides <- dbGetQuery(conn, "SELECT * FROM bus_rides")

    rail_stops <- dbGetQuery(conn, "SELECT * FROM rail_stops")
    rail_rides <- dbGetQuery(conn, "SELECT * FROM rail_rides")

    dbDisconnect(conn)
}

summary(microbenchmark(csv_import))
summary(microbenchmark(sql_import))

View CSV Data

bus_month_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
head(bus_month_csv)
bus_rides_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')
head(bus_rides_csv)
rail_stops_csv <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
names(rail_stops_csv) <- tolower(names(rail_stops_csv))
head(rail_stops_csv)
rail_rides_csv <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
head(rail_rides_csv)
bus_month_csv$Month_Beginning <- as.Date(bus_month_csv$Month_Beginning, format="%m/%d/%Y", origin="1970-01-01")

bus_rides_csv$date <- as.Date(bus_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")

rail_rides_csv$date <- as.Date(rail_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")

View SQL Data

conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
                  user="ctadba", password="cta18!", port=5432)

bus_month_sql <- dbGetQuery(conn, "SELECT * FROM bus_month")
head(bus_month_sql)
bus_rides_sql <- dbGetQuery(conn, "SELECT * FROM bus_rides")
head(bus_rides_sql)
rail_stations_sql <- dbGetQuery(conn, "SELECT * FROM rail_stations")
head(rail_stations_sql)
rail_rides_sql <- dbGetQuery(conn, "SELECT * FROM rail_rides")
head(rail_rides_sql)

Aggregate CSV Data

# MERGE
agg_csv <- merge(unique(bus_month_csv[c("route", "routename")]), bus_rides_csv, by="route")

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ route + routename, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])

agg_csv

Top 5 Bus Routes

# MERGE
agg_csv <- merge(subset(unique(bus_month_csv[c("route", "routename")]), 
                        routename %in% c("79th", "Ashland", "Chicago", "Western", "Cottage Grove")),
                 transform(bus_rides_csv, year=format(date, "%Y")),
                 by="route")

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ routename + year, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(routename, year),])

agg_csv

Graph CSV Data

seabornPalette <- c("#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
                    "#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
                    "#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
                    "#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
                    "#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd")

ggplot(agg_csv, aes(year, rides.sum, fill=routename)) + geom_col(position = "dodge") +
  labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Aggregate SQL Data

sql <- 'SELECT rt.route_name, COUNT(rd.rides) AS "count", 
                              SUM(rd.rides) AS "sum", 
                              AVG(rd.rides) AS "mean", 
                              MEDIAN(rd.rides) AS "median",
                              MIN(rd.rides) AS "min", 
                              MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC'

agg_sql <- dbGetQuery(conn, sql)

agg_sql
sql <- 'SELECT rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer AS "year", 
             COUNT(rd.rides) AS "count", 
             SUM(rd.rides) AS "sum", 
             AVG(rd.rides) AS "mean", 
             MEDIAN(rd.rides) AS "median",
             MIN(rd.rides) AS "min", 
             MAX(rd.rides) AS "max"
      FROM bus_routes rt
      INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
      WHERE rt.route_name IN (\'79th\', \'Ashland\', \'Chicago\', \'Western\', \'Cottage Grove\')
      GROUP BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer
      ORDER BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer'

agg_sql <- dbGetQuery(conn, sql)

agg_sql

Graph SQL Data

ggplot(agg_sql, aes(year, sum, color=route_name)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

CSV Data Diagnostics

# TRANSFORM
agg_csv <- transform(rail_rides_csv, year=format(date, "%Y"))

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ station_id + stationname + year, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])

agg_csv
# RESHAPE
rail_lines <- c("red", "blue", "green", "brown", "purple", "purple_exp", "yellow", "pink", "orange")

rail_long <- reshape(setNames(rail_stops_csv[c("map_id", "station_name", "location", "red", "blue", "g", 
                                               "brn", "p", "pexp", "y", "pnk", "o")],
                              c("station_id", "station_name", "location", rail_lines)),
                     varying = rail_lines, v.names = "value", 
                     timevar = "rail_line", times = rail_lines,
                     new.row.names = 1:10000, direction = "long")

# SUBSET
rail_long <- unique(subset(rail_long, value=='true')[c("station_id", "station_name", "location", "rail_line")])

# ORDER
rail_long <- with(rail_long, rail_long[order(station_id, rail_line),])
row.names(rail_long) <- NULL

rail_long
merge_rail <- merge(agg_csv, rail_long, by="station_id")

merge_rail$rides.total <- merge_rail$rides.sum / with(merge_rail, ave(station_id, station_id, year, FUN=length))

merge_rail
agg_csv <- aggregate(rides.total ~ year + rail_line, merge_rail, sum)
agg_csv <- with(agg_csv, agg_csv[order(rail_line, year),])
row.names(agg_csv) <- NULL

agg_csv

SQL Data Diagnostics

sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

agg_sql
cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(subset(agg_sql, year > 2012), aes(year, rail_total, fill=rail_line)) + geom_col(position = "dodge") +
  labs(title="CTA System 'L' Lines - Total Rides By Year", x="Year", y="Rides") +
  scale_x_continuous(expand = c(0,0), "year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_fill_manual(values = cta_palette) + guides(fill=guide_legend("Rail Lines", nrow=1)) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Distribution

sql <- 'SELECT r.station_id, r.ride_date, r.station_name, s.rail_line, r.rides,
               (r.rides / COUNT(*) OVER(PARTITION BY s.station_id, r.ride_date)) AS rides_total
        FROM rail_rides r
        INNER JOIN rail_stations s ON s.station_id = r.station_id'

hist_long <- dbGetQuery(conn, sql)

hist_long
ggplot(hist_long, aes(x=rides_total, fill=rail_line)) +
   geom_histogram(data=subset(hist_long, rail_line == 'red'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'blue'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'brown'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'green'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'orange'), bins=100) +
   scale_x_continuous(expand = c(0, 0)) +
   scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
   scale_fill_manual(values = c(red = "#C60C30", blue = "#00A1DE", brown = "#62361B",
                                green = "#009B3A", orange = "#F9461C")) +
   labs(title="CTA Ridership Distribution By Rail Line", fill="Rail Line") +
   theme(plot.title = element_text(hjust=0.5, size=18))

ggplot(transform(hist_long, year = format(ride_date, '%Y')), 
       aes(x=year, y=rides_total, fill=year)) + 
    geom_boxplot() + guides(fill=FALSE) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Year") +
    theme(plot.title = element_text(hjust=0.5, size=18))

ggplot(hist_long, aes(x=rail_line, y=rides_total, fill=rail_line)) + 
    geom_boxplot() + guides(fill=FALSE) +
    scale_fill_manual(values = cta_palette) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Rail Line") +
    theme(plot.title = element_text(hjust=0.5, size=18))

ggplot(subset(within(hist_long, { year <- format(ride_date, '%Y')
                                  year <- as.integer(as.character(year))
                                }),
              year >= 2015 & year <= 2017), 
       aes(x=factor(year), y=rides_total, fill=rail_line)) + 
    geom_boxplot() + guides(fill=guide_legend("Rail Lines", nrow=1)) +
    scale_fill_manual(values = cta_palette) +
    scale_x_discrete(expand = c(0, 0)) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Rail Line, 2015-2017", x="Year", y="Rides") +
    theme(legend.position="bottom", plot.title = element_text(hjust=0.5, size=18))

Correlation

sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year",
             SUM(rail_total) FILTER (WHERE rail_line = \'blue\') AS blue,
             SUM(rail_total) FILTER (WHERE rail_line = \'brown\') AS brown,
             SUM(rail_total) FILTER (WHERE rail_line = \'green\') AS green,
             SUM(rail_total) FILTER (WHERE rail_line = \'orange\') AS orange,
             SUM(rail_total) FILTER (WHERE rail_line = \'pink\') AS pink,
             SUM(rail_total) FILTER (WHERE rail_line = \'purple\') AS purple,
             SUM(rail_total) FILTER (WHERE rail_line = \'purple_exp\') AS purple_exp,
             SUM(rail_total) FILTER (WHERE rail_line = \'red\') AS red,
             SUM(rail_total) FILTER (WHERE rail_line = \'yellow\') AS yellow
      FROM merge_rail m
      GROUP BY m."year"
      ORDER BY m."year"'

wide_sql <- dbGetQuery(conn, sql)

wide_sql
cor(wide_sql[-1], use = "complete.obs", method="pearson")
                blue     brown     green    orange      pink    purple
blue       1.0000000 0.9735257 0.8909543 0.9490871 0.8764534 0.7780920
brown      0.9735257 1.0000000 0.8839102 0.9165204 0.8888845 0.7446475
green      0.8909543 0.8839102 1.0000000 0.9295203 0.8768352 0.7684971
orange     0.9490871 0.9165204 0.9295203 1.0000000 0.8697865 0.8742380
pink       0.8764534 0.8888845 0.8768352 0.8697865 1.0000000 0.5573137
purple     0.7780920 0.7446475 0.7684971 0.8742380 0.5573137 1.0000000
purple_exp 0.9707673 0.9785738 0.9264667 0.9722522 0.8994040 0.8304641
red        0.8931410 0.8892051 0.7947539 0.9334301 0.7611228 0.9142396
yellow     0.7415022 0.7744762 0.7812455 0.7698821 0.6214899 0.7375434
           purple_exp       red    yellow
blue        0.9707673 0.8931410 0.7415022
brown       0.9785738 0.8892051 0.7744762
green       0.9264667 0.7947539 0.7812455
orange      0.9722522 0.9334301 0.7698821
pink        0.8994040 0.7611228 0.6214899
purple      0.8304641 0.9142396 0.7375434
purple_exp  1.0000000 0.9355244 0.7854706
red         0.9355244 1.0000000 0.7241108
yellow      0.7854706 0.7241108 1.0000000

T-tests

combns <- as.list(outer(rail_lines, rail_lines, function(x, y) ifelse(x==y, NA, paste(x, y))))
combns <- lapply(combns[!is.na(combns)], function(x) strsplit(x, split=" ")[[1]])

ttest_matrix <- sapply(combns, function(x){
  t <- t.test(wide_sql[[x[1]]], wide_sql[[x[2]]])
  c(statistic = t$statistic, p_value = t$p.value)
  
})

ttest_df <- data.frame(x = sapply(combns, "[", 1),
                       y = sapply(combns, "[", 2),
                       statistic = ttest_matrix[1,],
                       p_value = ttest_matrix[2,])

ttest_df <- with(ttest_df, ttest_df[order(x, y),])
ttest_df
by(ttest_df, ttest_df$x, function(sub)
  
  ggplot(sub, aes(x, statistic, fill=y)) + geom_col(position = "dodge") +
    labs(title=paste0("CTA System 'L' Lines - Pairwise T-tests for ", 
                     toupper(substr(sub$x[[1]], 1, 1)), 
                     substr(sub$x[[1]], 2, nchar(as.character(sub$x[[1]]))), " Line"), 
         x="Rail Line", y="T-test Stat") +
    scale_x_discrete(expand = c(0,0)) + 
    scale_y_continuous(expand = c(0, 0), label=comma) +
    scale_fill_manual(values = cta_palette[!names(cta_palette)==sub$x[[1]]]) + 
    guides(fill=guide_legend("Rail Lines", nrow=1)) +
    theme(legend.position="bottom",
          plot.title = element_text(hjust=0.5, size=18, colour=cta_palette[names(cta_palette)==sub$x[[1]]]),
          axis.text.x = element_text(angle=0, hjust=0.5))
  )
ttest_df$x: blue

-------------------------------------------------------- 
ttest_df$x: brown

-------------------------------------------------------- 
ttest_df$x: green

-------------------------------------------------------- 
ttest_df$x: orange

-------------------------------------------------------- 
ttest_df$x: pink

-------------------------------------------------------- 
ttest_df$x: purple

-------------------------------------------------------- 
ttest_df$x: purple_exp

-------------------------------------------------------- 
ttest_df$x: red

-------------------------------------------------------- 
ttest_df$x: yellow

CSV Regression

Right-Hand Side Variables

  • Again, requires maintenance and storage on disk space

  • Again, requires load time from non-centralized paths

  • Again, requires any ad-hoc munging for usability

# Source: St. Louis Federal Reserve Bank
unemployment_df <- read.csv('Chicago_Unemployment_Rates.csv')
unemployment_df$Date <- as.Date(unemployment_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(unemployment_df)
# Source: U.S. Department of Energy: EIA
gas_prices_df <- read.csv('US_Gas_Prices.csv')
gas_prices_df$Date <- as.Date(gas_prices_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(gas_prices_df)
# Source: U.S. National Oceanic and Atmospheric Administration (NOAA)
weather_df <- read.csv('Chicago_Weather_Data.csv')
weather_df$Date <- as.Date(weather_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(weather_df)

Bus Model Data

bus_model_data <- merge(unique(bus_month_csv[c("route")]), bus_rides_csv, by="route")

bus_model_data <- merge(bus_model_data, unemployment_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, gas_prices_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, weather_df, by.x='date', by.y='Date')

head(bus_model_data)

Add Seasons

bus_model_data$normalized_dt <- as.POSIXlt(bus_model_data$date)
bus_model_data$normalized_dt$year <- bus_model_data$normalized_dt$year +
                                        (2099 - as.integer(format(bus_model_data$date, "%Y")))
bus_model_data$normalized_dt <- as.Date(bus_model_data$normalized_dt)


bus_model_data$season <- ifelse(bus_model_data$normalized_dt >= '2099-01-01' & 
                                   bus_model_data$normalized_dt  < '2099-03-19', 'winter',
                                ifelse(bus_model_data$normalized_dt >= '2099-03-20' & 
                                          bus_model_data$normalized_dt  < '2099-06-19', 'spring',
                                       ifelse(bus_model_data$normalized_dt >= '2099-06-20' & 
                                                 bus_model_data$normalized_dt  < '2099-09-19', 'summer',
                                              ifelse(bus_model_data$normalized_dt >= '2099-09-20' & 
                                                        bus_model_data$normalized_dt  < '2099-12-19', 'fall',
                                                     ifelse(bus_model_data$normalized_dt >= '2099-12-20' & 
                                                              bus_model_data$normalized_dt  < '2099-12-31', 'winter',
                                                            NA)
                                              )
                                       )
                                )
                         )
                                       
bus_model_data[sample(nrow(bus_model_data), 10), c("normalized_dt", "date", "season")]
bus_model_data$normalized_dt <- NULL

Bus Modeling (Ordinary Least Squares)

model <- lm(rides ~ daytype + season + UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth,
            data = bus_model_data)

summary(model)

Call:
lm(formula = rides ~ daytype + season + UE_Rate + Gas_Price + 
    AvgTemp + Precipitation + SnowDepth, data = bus_model_data)

Residuals:
   Min     1Q Median     3Q    Max 
 -7816  -5073  -2142   3488  37902 

Coefficients:
                Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    5290.4643    53.4881  98.909 <0.0000000000000002 ***
daytypeU      -1514.5231    33.9281 -44.639 <0.0000000000000002 ***
daytypeW        720.1838    25.7966  27.918 <0.0000000000000002 ***
seasonspring   -258.1290    23.6427 -10.918 <0.0000000000000002 ***
seasonsummer   -655.5754    29.3822 -22.312 <0.0000000000000002 ***
seasonwinter   -375.9811    27.6194 -13.613 <0.0000000000000002 ***
UE_Rate          99.5818     4.3064  23.124 <0.0000000000000002 ***
Gas_Price       -18.3744    10.7936  -1.702              0.0887 .  
AvgTemp          12.1592     0.7392  16.450 <0.0000000000000002 ***
Precipitation  -346.6681    24.0609 -14.408 <0.0000000000000002 ***
SnowDepth        -9.8439     5.3401  -1.843              0.0653 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6484 on 638980 degrees of freedom
  (142589 observations deleted due to missingness)
Multiple R-squared:  0.01483,   Adjusted R-squared:  0.01482 
F-statistic:   962 on 10 and 638980 DF,  p-value: < 0.00000000000000022
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Bus Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-1600,1000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Rail Model Data

Assign Latitude and Longitude

rail_long$latitude <- as.numeric(gsub("\\(", "", gsub(",", "", sapply(as.character(rail_long$location), 
                                                         function(x) strsplit(x, split="\\s+")[[1]][1])))
                                )

rail_long$longitude <- as.numeric(gsub("\\)", "", sapply(as.character(rail_long$location), 
                                            function(x) strsplit(x, split="\\s+")[[1]][2]))
                                  )

rail_long[sample(nrow(rail_long), 10), c("location", "latitude", "longitude")]
rail_model_data <- merge(rail_long, rail_rides_csv, by="station_id")

rail_model_data$raw <- rail_model_data$rides

rail_model_data$rides <- with(rail_model_data, rides /
                                  ave(station_id, station_id, date, FUN=length))

rail_model_data <- merge(rail_model_data, unemployment_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, gas_prices_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, weather_df, by.x='date', by.y='Date')

head(rail_model_data, 10)

Add Seasons

rail_model_data$normalized_dt <- as.POSIXlt(rail_model_data$date)
rail_model_data$normalized_dt$year <- rail_model_data$normalized_dt$year +
                                        (2099 - as.integer(format(rail_model_data$date, "%Y")))
rail_model_data$normalized_dt <- as.Date(rail_model_data$normalized_dt)


rail_model_data$season <- ifelse(rail_model_data$normalized_dt >= '2099-01-01' & 
                                   rail_model_data$normalized_dt  < '2099-03-19', 'winter',
                                ifelse(rail_model_data$normalized_dt >= '2099-03-20' & 
                                          rail_model_data$normalized_dt  < '2099-06-19', 'spring',
                                       ifelse(rail_model_data$normalized_dt >= '2099-06-20' & 
                                                 rail_model_data$normalized_dt  < '2099-09-19', 'summer',
                                              ifelse(rail_model_data$normalized_dt >= '2099-09-20' & 
                                                        rail_model_data$normalized_dt  < '2099-12-19', 'fall',
                                                     ifelse(rail_model_data$normalized_dt >= '2099-12-20' & 
                                                              rail_model_data$normalized_dt  < '2099-12-31', 'winter',
                                                            NA)
                                              )
                                       )
                                )
                         )
                                       
rail_model_data[sample(nrow(rail_model_data), 10), c("normalized_dt", "date", "season")]
rail_model_data$normalized_dt <- NULL

Rail Modeling (Ordinary Least Squares)

model <- lm(rides ~ daytype + season + latitude + longitude + rail_line + 
                    UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth, 
            data = rail_model_data)

summary(model)

Call:
lm(formula = rides ~ daytype + season + latitude + longitude + 
    rail_line + UE_Rate + Gas_Price + AvgTemp + Precipitation + 
    SnowDepth, data = rail_model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6252.1  -933.1  -256.4   537.0 30569.2 

Coefficients:
                       Estimate  Std. Error t value             Pr(>|t|)
(Intercept)         -67697.5885   3581.3622  -18.90 < 0.0000000000000002
daytypeU              -453.5025      6.6974  -67.71 < 0.0000000000000002
daytypeW              1160.6410      5.3328  217.64 < 0.0000000000000002
seasonspring          -178.8823      5.3762  -33.27 < 0.0000000000000002
seasonsummer          -189.5721      6.6870  -28.35 < 0.0000000000000002
seasonwinter          -157.0931      6.2731  -25.04 < 0.0000000000000002
latitude             -3995.6159     36.9560 -108.12 < 0.0000000000000002
longitude            -2703.3418     44.1307  -61.26 < 0.0000000000000002
rail_linebrown       -1180.7391      7.2879 -162.01 < 0.0000000000000002
rail_linegreen       -2070.2382      6.9165 -299.32 < 0.0000000000000002
rail_lineorange      -1077.6339      8.3798 -128.60 < 0.0000000000000002
rail_linepink        -2211.9982      7.4465 -297.05 < 0.0000000000000002
rail_linepurple      -2027.7884     11.2705 -179.92 < 0.0000000000000002
rail_linepurple_exp  -1547.7298      7.7477 -199.77 < 0.0000000000000002
rail_linered          1640.2397      6.9847  234.83 < 0.0000000000000002
rail_lineyellow      -1400.7779     17.7956  -78.72 < 0.0000000000000002
UE_Rate                 -5.1186      0.9920   -5.16          0.000000247
Gas_Price              185.7348      2.4565   75.61 < 0.0000000000000002
AvgTemp                  5.0953      0.1679   30.34 < 0.0000000000000002
Precipitation          -82.5539      5.3670  -15.38 < 0.0000000000000002
SnowDepth                1.4842      1.2067    1.23                0.219
                       
(Intercept)         ***
daytypeU            ***
daytypeW            ***
seasonspring        ***
seasonsummer        ***
seasonwinter        ***
latitude            ***
longitude           ***
rail_linebrown      ***
rail_linegreen      ***
rail_lineorange     ***
rail_linepink       ***
rail_linepurple     ***
rail_linepurple_exp ***
rail_linered        ***
rail_lineyellow     ***
UE_Rate             ***
Gas_Price           ***
AvgTemp             ***
Precipitation       ***
SnowDepth              
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1850 on 1006966 degrees of freedom
  (223613 observations deleted due to missingness)
Multiple R-squared:  0.4033,    Adjusted R-squared:  0.4033 
F-statistic: 3.403e+04 on 20 and 1006966 DF,  p-value: < 0.00000000000000022
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))

SQL Regression

Bus Modeling Data

CREATE MATERIALIZED VIEW Bus_Model_Data AS
    SELECT b.id, b.route_id, b.ride_date, b.day_type, b.rides, r.route_name, 
           CASE 
               WHEN b.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN b.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN b.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN b.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN b.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
     (
      SELECT id, route_id, day_type, rides, ride_date, 
             ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
      FROM bus_rides
     ) b
    INNER JOIN bus_routes r ON b.route_id = r.route_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = b.ride_date
    INNER JOIN gas_prices g ON g.gas_date = b.ride_date
    INNER JOIN weather_data w ON w.weather_date = b.ride_date
    ORDER BY b.ride_date, NULLIF(regexp_replace(b.route_id, '\D', '', 'g'), '')::int;
    
REFRESH MATERIALIZED VIEW Bus_Model_Data;
bus_model_data <- dbGetQuery(conn, "SELECT * FROM bus_model_data")

head(bus_model_data)

Bus Modeling (Ordinary Least Squares)

model <- lm(rides ~ day_type + season + ue_rate + gas_price + avg_temp + precipitation + snow_depth,
            data = bus_model_data)

summary(model)

Call:
lm(formula = rides ~ day_type + season + ue_rate + gas_price + 
    avg_temp + precipitation + snow_depth, data = bus_model_data)

Residuals:
   Min     1Q Median     3Q    Max 
 -7813  -5072  -2142   3487  37904 

Coefficients:
                Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    5290.1066    52.9123  99.979 <0.0000000000000002 ***
day_typeU     -1512.5779    33.7077 -44.873 <0.0000000000000002 ***
day_typeW       719.1988    25.6034  28.090 <0.0000000000000002 ***
seasonspring   -259.2512    23.5676 -11.000 <0.0000000000000002 ***
seasonsummer   -644.0360    29.1817 -22.070 <0.0000000000000002 ***
seasonwinter   -384.3928    27.1703 -14.148 <0.0000000000000002 ***
ue_rate          99.6607     4.2720  23.329 <0.0000000000000002 ***
gas_price       -19.6097    10.7135  -1.830              0.0672 .  
avg_temp         12.1726     0.7301  16.673 <0.0000000000000002 ***
precipitation  -350.3917    23.6077 -14.842 <0.0000000000000002 ***
snow_depth       -9.6659     5.3147  -1.819              0.0690 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6483 on 648166 degrees of freedom
  (133403 observations deleted due to missingness)
Multiple R-squared:  0.01478,   Adjusted R-squared:  0.01477 
F-statistic: 972.7 on 10 and 648166 DF,  p-value: < 0.00000000000000022
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Bus Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-1600, 1000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Rail Modeling Data

CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

head(rail_model_data)

Rail Modeling (Ordinary Least Squares)

model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

summary(model)

Call:
lm(formula = rides ~ day_type + season + latitude + longitude + 
    rail_line + ue_rate + gas_price + avg_temp + precipitation + 
    snow_depth, data = rail_model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6249.5  -932.9  -256.2   536.9 30570.4 

Coefficients:
                       Estimate  Std. Error  t value             Pr(>|t|)
(Intercept)         -68149.8607   3556.3642  -19.163 < 0.0000000000000002
day_typeU             -454.4689      6.6549  -68.290 < 0.0000000000000002
day_typeW             1157.9025      5.2941  218.717 < 0.0000000000000002
seasonspring          -177.8150      5.3603  -33.172 < 0.0000000000000002
seasonsummer          -185.4726      6.6439  -27.916 < 0.0000000000000002
seasonwinter          -160.7995      6.1740  -26.044 < 0.0000000000000002
latitude             -3995.1405     36.6987 -108.863 < 0.0000000000000002
longitude            -2708.3027     43.8227  -61.801 < 0.0000000000000002
rail_linebrown       -1180.4292      7.2371 -163.109 < 0.0000000000000002
rail_linegreen       -2069.6169      6.8682 -301.334 < 0.0000000000000002
rail_lineorange      -1077.4023      8.3213 -129.476 < 0.0000000000000002
rail_linepink        -2211.5330      7.3945 -299.076 < 0.0000000000000002
rail_linepurple      -2027.3643     11.1919 -181.146 < 0.0000000000000002
rail_linepurple_exp  -1547.6652      7.6936 -201.163 < 0.0000000000000002
rail_linered          1641.3142      6.9360  236.637 < 0.0000000000000002
rail_lineyellow      -1400.4416     17.6707  -79.252 < 0.0000000000000002
ue_rate                 -5.3018      0.9843   -5.387         0.0000000718
gas_price              185.5411      2.4399   76.046 < 0.0000000000000002
avg_temp                 5.0833      0.1659   30.635 < 0.0000000000000002
precipitation          -83.1416      5.2864  -15.727 < 0.0000000000000002
snow_depth               1.6746      1.2015    1.394                0.163
                       
(Intercept)         ***
day_typeU           ***
day_typeW           ***
seasonspring        ***
seasonsummer        ***
seasonwinter        ***
latitude            ***
longitude           ***
rail_linebrown      ***
rail_linegreen      ***
rail_lineorange     ***
rail_linepink       ***
rail_linepurple     ***
rail_linepurple_exp ***
rail_linered        ***
rail_lineyellow     ***
ue_rate             ***
gas_price           ***
avg_temp            ***
precipitation       ***
snow_depth             
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1850 on 1021214 degrees of freedom
  (209365 observations deleted due to missingness)
Multiple R-squared:  0.4031,    Adjusted R-squared:  0.4031 
F-statistic: 3.449e+04 on 20 and 1021214 DF,  p-value: < 0.00000000000000022
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)
[1] TRUE

Conclusions




---
title: "Leveraging PostgreSQL in Data Science with R"
output: html_notebook
---

<div style="float:right"><img src="IMAGES/postgresql_r.png" height="200" width="200"/></div>

<div style="font-size: 24px;">How PGSQL can facilitate the data processing workflow</div>
## Parfait Gasana ##
<div style="font-size: 20px;">Data Analyst, Winston & Strawn</div>

<style type="text/css">
div.blue pre { background-color: #EBF4FA; }
.main-container {
  max-width: 1000px;
  margin-left: auto;
  margin-right: auto;
}
</style>

```{r}
## Libraries
library(DBI)
library(RPostgreSQL)
library(microbenchmark)

library(scales)
library(ggplot2)

options(repr.plot.width=15, repr.plot.height=5)
options(scipen=999)

```

## <span style="color: #646464">Import CSV Data</span>

- #### <span style="color: #646464">Often one of the largest time and resource-intensive steps for data analysts</span> ####
- #### <span style="color: #646464">Requires disk space on shared network or local drives</span> ####
- #### <span style="color: #646464">Multiple files usually unmanaged across versions and teams</span> ####

```{r}
csv_import <- function() {
    setwd("~/Documents/PGSQL")
    
    bus_month <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
    bus_rides <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')

    rail_stops <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
    rail_rides <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
}
```

## <span style="color:#336791">Import SQL Data</span>

- ### <span style="color:#336791">Centralized data access locally or remotely</span> ###
    - #### <span style="color:#336791">Relational model ensures no redundancy</span> ####
- ### <span style="color:#336791">Secured and robust data storage</span> ###
    - #### <span style="color:#336791">Available backup with no need of local computers</span> ####
- ### <span style="color:#336791">Table and set relations seamless integrates into datasets</span> ###
    - #### <span style="color:#336791">atomic columns and diverse rowset</span> ####

<div class="blue">
```{r}
sql_import <- function() {
    conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
                      user="ctadba", password="cta18!", port=5432)

    bus_month <- dbGetQuery(conn, "SELECT * FROM bus_month")
    bus_rides <- dbGetQuery(conn, "SELECT * FROM bus_rides")

    rail_stops <- dbGetQuery(conn, "SELECT * FROM rail_stops")
    rail_rides <- dbGetQuery(conn, "SELECT * FROM rail_rides")

    dbDisconnect(conn)
}

summary(microbenchmark(csv_import))
summary(microbenchmark(sql_import))

```
</div>

## <span style="color: #646464">View CSV Data</span>

- ### <span style="color: #646464">Often requires ad-hoc munging: re-formatting or conversion of columns</span> ###
- ### <span style="color: #646464">Cleaning out reports and metadata from actual data on the fly</span> ###
- ### <span style="color: #646464">Adjusting different levels and reshaping for analytical purposes</span> ###

```{r}
bus_month_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
head(bus_month_csv)

bus_rides_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')
head(bus_rides_csv)

rail_stops_csv <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
names(rail_stops_csv) <- tolower(names(rail_stops_csv))
head(rail_stops_csv)

rail_rides_csv <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
head(rail_rides_csv)
```

```{r}
bus_month_csv$Month_Beginning <- as.Date(bus_month_csv$Month_Beginning, format="%m/%d/%Y", origin="1970-01-01")

bus_rides_csv$date <- as.Date(bus_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")

rail_rides_csv$date <- as.Date(rail_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")
```

## <span style="color:#336791">View SQL Data</span>

- ### <span style="color:#336791">Relational databases are planned systems</span> ###
- ### <span style="color:#336791">Constraints and referential integrity ensures stability and relationships</span> ###
- ### <span style="color:#336791">Query language provides expression, reliablibility and versatility</span> ###

<div class="blue">
```{r}
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
                  user="ctadba", password="cta18!", port=5432)

bus_month_sql <- dbGetQuery(conn, "SELECT * FROM bus_month")
head(bus_month_sql)

bus_rides_sql <- dbGetQuery(conn, "SELECT * FROM bus_rides")
head(bus_rides_sql)

rail_stations_sql <- dbGetQuery(conn, "SELECT * FROM rail_stations")
head(rail_stations_sql)

rail_rides_sql <- dbGetQuery(conn, "SELECT * FROM rail_rides")
head(rail_rides_sql)

```
</div>


## <span style="color: #646464">Aggregate CSV Data</span> 

- ### <span style="color: #646464">Data tool syntax can be formidable for newcomers or returning users</span> ###
- ### <span style="color: #646464">Complex processes require long piping/chaining of objects and methods</span> ###
- ### <span style="color: #646464">Language lacks set-based (i.e., relations, join, union) framework</span> ###
  

```{r}
# MERGE
agg_csv <- merge(unique(bus_month_csv[c("route", "routename")]), bus_rides_csv, by="route")

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ route + routename, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])

agg_csv
```


### <span style="color: #646464">Top 5 Bus Routes</span> 

```{r}
# MERGE
agg_csv <- merge(subset(unique(bus_month_csv[c("route", "routename")]), 
                        routename %in% c("79th", "Ashland", "Chicago", "Western", "Cottage Grove")),
                 transform(bus_rides_csv, year=format(date, "%Y")),
                 by="route")

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ routename + year, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(routename, year),])

agg_csv
```


## <span style="color: #646464">Graph CSV Data</span> 

```{r fig1, fig.height = 4, fig.width = 10, fig.align = "center"}

seabornPalette <- c("#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
                    "#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
                    "#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
                    "#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
                    "#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd")

ggplot(agg_csv, aes(year, rides.sum, fill=routename)) + geom_col(position = "dodge") +
  labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

## <span style="color:#336791">Aggregate SQL Data</span> 

- ### <span style="color:#336791">Clear, compact declarative language with portability</span> ####
- ### <span style="color:#336791">Processing with virtual tables occurs behind the scene</span> ###
- ### <span style="color:#336791">Set-based framework facilitates blockwise, vectorized process</span> ###

<div class="blue">
```{r}
sql <- 'SELECT rt.route_name, COUNT(rd.rides) AS "count", 
                              SUM(rd.rides) AS "sum", 
                              AVG(rd.rides) AS "mean", 
                              MEDIAN(rd.rides) AS "median",
                              MIN(rd.rides) AS "min", 
                              MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC'

agg_sql <- dbGetQuery(conn, sql)

agg_sql
```

```{r}
sql <- 'SELECT rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer AS "year", 
             COUNT(rd.rides) AS "count", 
             SUM(rd.rides) AS "sum", 
             AVG(rd.rides) AS "mean", 
             MEDIAN(rd.rides) AS "median",
             MIN(rd.rides) AS "min", 
             MAX(rd.rides) AS "max"
      FROM bus_routes rt
      INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
      WHERE rt.route_name IN (\'79th\', \'Ashland\', \'Chicago\', \'Western\', \'Cottage Grove\')
      GROUP BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer
      ORDER BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer'

agg_sql <- dbGetQuery(conn, sql)

agg_sql
```
</div>

## <span style="color:#336791">Graph SQL Data</span>

<div class="blue">
```{r fig2, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(agg_sql, aes(year, sum, color=route_name)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```
</div>

## <span style="color: #646464">CSV Data Diagnostics</span> ##

- ### <span style="color: #646464">Imperative nature of processing</span> ###
- ### <span style="color: #646464">Dense, nested calls for layered steps</span> ###
- ### <span style="color: #646464">Limited to application layer</span> ###

```{r}
# TRANSFORM
agg_csv <- transform(rail_rides_csv, year=format(date, "%Y"))

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ station_id + stationname + year, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])

agg_csv
```


```{r}
# RESHAPE
rail_lines <- c("red", "blue", "green", "brown", "purple", "purple_exp", "yellow", "pink", "orange")

rail_long <- reshape(setNames(rail_stops_csv[c("map_id", "station_name", "location", "red", "blue", "g", 
                                               "brn", "p", "pexp", "y", "pnk", "o")],
                              c("station_id", "station_name", "location", rail_lines)),
                     varying = rail_lines, v.names = "value", 
                     timevar = "rail_line", times = rail_lines,
                     new.row.names = 1:10000, direction = "long")

# SUBSET
rail_long <- unique(subset(rail_long, value=='true')[c("station_id", "station_name", "location", "rail_line")])

# ORDER
rail_long <- with(rail_long, rail_long[order(station_id, rail_line),])
row.names(rail_long) <- NULL

rail_long
```


```{r}
merge_rail <- merge(agg_csv, rail_long, by="station_id")

merge_rail$rides.total <- merge_rail$rides.sum / with(merge_rail, ave(station_id, station_id, year, FUN=length))

merge_rail
```


```{r}
agg_csv <- aggregate(rides.total ~ year + rail_line, merge_rail, sum)
agg_csv <- with(agg_csv, agg_csv[order(rail_line, year),])
row.names(agg_csv) <- NULL

agg_csv
```

## <span style="color:#336791">SQL Data Diagnostics</span>

- ### <span style="color:#336791">Complex processing still readable and maintainable</span> ###
- ### <span style="color:#336791">CTEs clearly show underlying tables and views without helper objects</span> ###
- ### <span style="color:#336791">Window functions allow useful inline calculations</span> ###
- ### <span style="color:#336791">Conditional aggregations clearly show reshaped data</span> ###
- ### <span style="color:#336791">Seamless casting and conversion of types with `::` operator</span> ###

<div class="blue">
```{r}
sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

agg_sql
```


```{r fig3, fig.height = 4, fig.width = 10, fig.align = "center"}
cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(subset(agg_sql, year > 2012), aes(year, rail_total, fill=rail_line)) + geom_col(position = "dodge") +
  labs(title="CTA System 'L' Lines - Total Rides By Year", x="Year", y="Rides") +
  scale_x_continuous(expand = c(0,0), "year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_fill_manual(values = cta_palette) + guides(fill=guide_legend("Rail Lines", nrow=1)) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

</div>

## <span style="color:#336791;">Distribution</span

```{r}
sql <- 'SELECT r.station_id, r.ride_date, r.station_name, s.rail_line, r.rides,
               (r.rides / COUNT(*) OVER(PARTITION BY s.station_id, r.ride_date)) AS rides_total
        FROM rail_rides r
        INNER JOIN rail_stations s ON s.station_id = r.station_id'

hist_long <- dbGetQuery(conn, sql)

hist_long
```

```{r fig4, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(hist_long, aes(x=rides_total, fill=rail_line)) +
   geom_histogram(data=subset(hist_long, rail_line == 'red'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'blue'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'brown'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'green'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'orange'), bins=100) +
   scale_x_continuous(expand = c(0, 0)) +
   scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
   scale_fill_manual(values = c(red = "#C60C30", blue = "#00A1DE", brown = "#62361B",
                                green = "#009B3A", orange = "#F9461C")) +
   labs(title="CTA Ridership Distribution By Rail Line", fill="Rail Line") +
   theme(plot.title = element_text(hjust=0.5, size=18))
```

```{r fig5, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(transform(hist_long, year = format(ride_date, '%Y')), 
       aes(x=year, y=rides_total, fill=year)) + 
    geom_boxplot() + guides(fill=FALSE) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Year") +
    theme(plot.title = element_text(hjust=0.5, size=18))
```

```{r fig6, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(hist_long, aes(x=rail_line, y=rides_total, fill=rail_line)) + 
    geom_boxplot() + guides(fill=FALSE) +
    scale_fill_manual(values = cta_palette) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Rail Line") +
    theme(plot.title = element_text(hjust=0.5, size=18))
```

```{r fig7, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(subset(within(hist_long, { year <- format(ride_date, '%Y')
                                  year <- as.integer(as.character(year))
                                }),
              year >= 2015 & year <= 2017), 
       aes(x=factor(year), y=rides_total, fill=rail_line)) + 
    geom_boxplot() + guides(fill=guide_legend("Rail Lines", nrow=1)) +
    scale_fill_manual(values = cta_palette) +
    scale_x_discrete(expand = c(0, 0)) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Rail Line, 2015-2017", x="Year", y="Rides") +
    theme(legend.position="bottom", plot.title = element_text(hjust=0.5, size=18))
```


## <span style="color:#336791">Correlation</span>

<div class="blue">
```{r}
sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year",
             SUM(rail_total) FILTER (WHERE rail_line = \'blue\') AS blue,
             SUM(rail_total) FILTER (WHERE rail_line = \'brown\') AS brown,
             SUM(rail_total) FILTER (WHERE rail_line = \'green\') AS green,
             SUM(rail_total) FILTER (WHERE rail_line = \'orange\') AS orange,
             SUM(rail_total) FILTER (WHERE rail_line = \'pink\') AS pink,
             SUM(rail_total) FILTER (WHERE rail_line = \'purple\') AS purple,
             SUM(rail_total) FILTER (WHERE rail_line = \'purple_exp\') AS purple_exp,
             SUM(rail_total) FILTER (WHERE rail_line = \'red\') AS red,
             SUM(rail_total) FILTER (WHERE rail_line = \'yellow\') AS yellow
      FROM merge_rail m
      GROUP BY m."year"
      ORDER BY m."year"'

wide_sql <- dbGetQuery(conn, sql)

wide_sql
```
</div>

<div class="blue">
```{r}
cor(wide_sql[-1], use = "complete.obs", method="pearson")
```
</div>

## <span style="color:#336791">T-tests</span>

<div class="blue">
```{r}
combns <- as.list(outer(rail_lines, rail_lines, function(x, y) ifelse(x==y, NA, paste(x, y))))
combns <- lapply(combns[!is.na(combns)], function(x) strsplit(x, split=" ")[[1]])

ttest_matrix <- sapply(combns, function(x){
  t <- t.test(wide_sql[[x[1]]], wide_sql[[x[2]]])
  c(statistic = t$statistic, p_value = t$p.value)
  
})

ttest_df <- data.frame(x = sapply(combns, "[", 1),
                       y = sapply(combns, "[", 2),
                       statistic = ttest_matrix[1,],
                       p_value = ttest_matrix[2,])

ttest_df <- with(ttest_df, ttest_df[order(x, y),])
ttest_df
```


```{r fig.height = 4, fig.width = 10, fig.align = "center"}

by(ttest_df, ttest_df$x, function(sub)
  
  ggplot(sub, aes(x, statistic, fill=y)) + geom_col(position = "dodge") +
    labs(title=paste0("CTA System 'L' Lines - Pairwise T-tests for ", 
                     toupper(substr(sub$x[[1]], 1, 1)), 
                     substr(sub$x[[1]], 2, nchar(as.character(sub$x[[1]]))), " Line"), 
         x="Rail Line", y="T-test Stat") +
    scale_x_discrete(expand = c(0,0)) + 
    scale_y_continuous(expand = c(0, 0), label=comma) +
    scale_fill_manual(values = cta_palette[!names(cta_palette)==sub$x[[1]]]) + 
    guides(fill=guide_legend("Rail Lines", nrow=1)) +
    theme(legend.position="bottom",
          plot.title = element_text(hjust=0.5, size=18, colour=cta_palette[names(cta_palette)==sub$x[[1]]]),
          axis.text.x = element_text(angle=0, hjust=0.5))
  )

```
</div>

## <span style="color: #646464">CSV Regression</span> ##

### <span style="color: #646464">Right-Hand Side Variables</span> ##

- ### <span style="color: #646464">Again, requires maintenance and storage on disk space</span> ###
- ### <span style="color: #646464">Again, requires load time from non-centralized paths</span> ###
- ### <span style="color: #646464">Again, requires any ad-hoc munging for usability</span> ###

```{r}

# Source: St. Louis Federal Reserve Bank
unemployment_df <- read.csv('Chicago_Unemployment_Rates.csv')
unemployment_df$Date <- as.Date(unemployment_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(unemployment_df)

# Source: U.S. Department of Energy: EIA
gas_prices_df <- read.csv('US_Gas_Prices.csv')
gas_prices_df$Date <- as.Date(gas_prices_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(gas_prices_df)

# Source: U.S. National Oceanic and Atmospheric Administration (NOAA)
weather_df <- read.csv('Chicago_Weather_Data.csv')
weather_df$Date <- as.Date(weather_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(weather_df)
```


## <span style="color: #646464">Bus Model Data</span>

- ### <span style="color: #646464">Echoes set-based joins</span> ###
- ### <span style="color: #646464">Repetitive sourcing  of same object</span> ###
- ### <span style="color: #646464">Nested dense processing of steps</span> ###

```{r}
bus_model_data <- merge(unique(bus_month_csv[c("route")]), bus_rides_csv, by="route")

bus_model_data <- merge(bus_model_data, unemployment_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, gas_prices_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, weather_df, by.x='date', by.y='Date')

head(bus_model_data)
```


### <span style="color: #646464">Add Seasons</span>

```{r}
bus_model_data$normalized_dt <- as.POSIXlt(bus_model_data$date)
bus_model_data$normalized_dt$year <- bus_model_data$normalized_dt$year +
                                        (2099 - as.integer(format(bus_model_data$date, "%Y")))
bus_model_data$normalized_dt <- as.Date(bus_model_data$normalized_dt)


bus_model_data$season <- ifelse(bus_model_data$normalized_dt >= '2099-01-01' & 
                                   bus_model_data$normalized_dt  < '2099-03-19', 'winter',
                                ifelse(bus_model_data$normalized_dt >= '2099-03-20' & 
                                          bus_model_data$normalized_dt  < '2099-06-19', 'spring',
                                       ifelse(bus_model_data$normalized_dt >= '2099-06-20' & 
                                                 bus_model_data$normalized_dt  < '2099-09-19', 'summer',
                                              ifelse(bus_model_data$normalized_dt >= '2099-09-20' & 
                                                        bus_model_data$normalized_dt  < '2099-12-19', 'fall',
                                                     ifelse(bus_model_data$normalized_dt >= '2099-12-20' & 
                                                              bus_model_data$normalized_dt  < '2099-12-31', 'winter',
                                                            NA)
                                              )
                                       )
                                )
                         )
                                       
bus_model_data[sample(nrow(bus_model_data), 10), c("normalized_dt", "date", "season")]
bus_model_data$normalized_dt <- NULL
```

## <span style="color: #646464">Bus Modeling (Ordinary Least Squares)</span>

```{r}
model <- lm(rides ~ daytype + season + UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth,
            data = bus_model_data)

summary(model)
```

```{r fig8, fig.height = 5, fig.width = 10, fig.align = "center"}
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Bus Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-1600,1000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```


## <span style="color: #646464">Rail Model Data</span>

### <span style="color: #646464">Assign Latitude and Longitude</span>

```{r}
rail_long$latitude <- as.numeric(gsub("\\(", "", gsub(",", "", sapply(as.character(rail_long$location), 
                                                         function(x) strsplit(x, split="\\s+")[[1]][1])))
                                )

rail_long$longitude <- as.numeric(gsub("\\)", "", sapply(as.character(rail_long$location), 
                                            function(x) strsplit(x, split="\\s+")[[1]][2]))
                                  )

rail_long[sample(nrow(rail_long), 10), c("location", "latitude", "longitude")]
```

```{r}
rail_model_data <- merge(rail_long, rail_rides_csv, by="station_id")

rail_model_data$raw <- rail_model_data$rides

rail_model_data$rides <- with(rail_model_data, rides /
                                  ave(station_id, station_id, date, FUN=length))

rail_model_data <- merge(rail_model_data, unemployment_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, gas_prices_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, weather_df, by.x='date', by.y='Date')

head(rail_model_data, 10)
```


### <span style="color: #646464">Add Seasons</span>

```{r}
rail_model_data$normalized_dt <- as.POSIXlt(rail_model_data$date)
rail_model_data$normalized_dt$year <- rail_model_data$normalized_dt$year +
                                        (2099 - as.integer(format(rail_model_data$date, "%Y")))
rail_model_data$normalized_dt <- as.Date(rail_model_data$normalized_dt)


rail_model_data$season <- ifelse(rail_model_data$normalized_dt >= '2099-01-01' & 
                                   rail_model_data$normalized_dt  < '2099-03-19', 'winter',
                                ifelse(rail_model_data$normalized_dt >= '2099-03-20' & 
                                          rail_model_data$normalized_dt  < '2099-06-19', 'spring',
                                       ifelse(rail_model_data$normalized_dt >= '2099-06-20' & 
                                                 rail_model_data$normalized_dt  < '2099-09-19', 'summer',
                                              ifelse(rail_model_data$normalized_dt >= '2099-09-20' & 
                                                        rail_model_data$normalized_dt  < '2099-12-19', 'fall',
                                                     ifelse(rail_model_data$normalized_dt >= '2099-12-20' & 
                                                              rail_model_data$normalized_dt  < '2099-12-31', 'winter',
                                                            NA)
                                              )
                                       )
                                )
                         )
                                       
rail_model_data[sample(nrow(rail_model_data), 10), c("normalized_dt", "date", "season")]
rail_model_data$normalized_dt <- NULL
```



## <span style="color: #646464">Rail Modeling (Ordinary Least Squares)</span>

```{r}
model <- lm(rides ~ daytype + season + latitude + longitude + rail_line + 
                    UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth, 
            data = rail_model_data)

summary(model)
```

```{r fig9, fig.height = 5, fig.width = 10, fig.align = "center"}
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))
```


## <span style="color:#336791">SQL Regression</span>

- ### <span style="color:#336791">Advanced preparation of data</span> ###
- ### <span style="color:#336791">Materialized view facilitates reproducible research</span> ###
- ### <span style="color:#336791">Compact and straightforward data sourcing</span> ###

## <span style="color:#336791">Bus Modeling Data</span>

<div class="blue">
```{sql, eval=FALSE}
CREATE MATERIALIZED VIEW Bus_Model_Data AS
    SELECT b.id, b.route_id, b.ride_date, b.day_type, b.rides, r.route_name, 
           CASE 
               WHEN b.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN b.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN b.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN b.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN b.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
     (
      SELECT id, route_id, day_type, rides, ride_date, 
             ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
      FROM bus_rides
     ) b
    INNER JOIN bus_routes r ON b.route_id = r.route_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = b.ride_date
    INNER JOIN gas_prices g ON g.gas_date = b.ride_date
    INNER JOIN weather_data w ON w.weather_date = b.ride_date
    ORDER BY b.ride_date, NULLIF(regexp_replace(b.route_id, '\D', '', 'g'), '')::int;
    
REFRESH MATERIALIZED VIEW Bus_Model_Data;
```

```{r}
bus_model_data <- dbGetQuery(conn, "SELECT * FROM bus_model_data")

head(bus_model_data)
```
</div>

## <span style="color:#336791">Bus Modeling (Ordinary Least Squares)</span>

<div class="blue">
```{r}
model <- lm(rides ~ day_type + season + ue_rate + gas_price + avg_temp + precipitation + snow_depth,
            data = bus_model_data)

summary(model)
```
</div>

```{r fig10, fig.height = 5, fig.width = 10, fig.align = "center"}
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Bus Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-1600, 1000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```


## <span style="color:#336791">Rail Modeling Data</span>

<div class="blue">
```{sql, eval=FALSE}
CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
```

```{r}
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

head(rail_model_data)
```
</div>

## <span style="color:#336791">Rail Modeling (Ordinary Least Squares)</span>

<div class="blue">
```{r}
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

summary(model)
```
</div>

```{r fig11, fig.height = 5, fig.width = 10, fig.align = "center"}
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))
```


```{r}
# DISCONNECT FROM DATABASE
dbDisconnect(conn)
```

## <span style="color:#336791">Conclusions</span> ##
<div style="float:right"><img src="IMAGES/postgresql_r.png" height="200" width="200"/></div>

- ### <span style="color:#336791">Postgres provides a stable, centralized, repository for data sourcing</span> ###
- ### <span style="color:#336791">Postgres maintains an expressive SQL dialect for data processing</span> ###
- ### <span style="color:#336791">Postgres supports data science with vectorized methods and reproducibility</span> ###

<br/>
<br/>
<br/>


